16S-rDNA-V3-V4 repository: https://github.com/ycl6/16S-rDNA-V3-V4

LEfSe: Bitbucket, Paper

export2graphlan: GitHub, export2graphlan

GraPhlAn: GitHub, Documentation, Paper

Demo Dataset: PRJEB27564 from Gut Microbiota in Parkinson’s Disease: Temporal Stability and Relations to Disease Progression. EBioMedicine. 2019;44:691-707

License: GPL-3.0


Workflow (Continues from Part 2)

Note: Requires Python 2

Start R

cd /ngs/16S-Demo/PRJEB27564

R

Load package and set path

library("data.table")
library("phyloseq")
library("ggplot2")
library("dplyr")
library("grid")
Sys.setlocale("LC_COLLATE", "C")
## [1] "C"

Set the full-path to additional scripts

source("/path-to-script/lefse.R", local = TRUE)

Load saved workspace from Part 2

load("2_phyloseq_tutorial.RData")

Create a folder lefse

if(!dir.exists("lefse")) { dir.create("lefse") }

Prepare input data

Subset samples

# Patient vs. control at baseline
ps1a = prune_samples(sample_names(ps1)[grep("B$", sample_names(ps1))], ps1)
ps1a = prune_taxa(taxa_sums(ps1a) > 0, ps1a)

# Patient followup vs. patient baseline
ps1b = prune_samples(sample_names(ps1)[grep("^P", sample_names(ps1))], ps1)
ps1b = prune_taxa(taxa_sums(ps1b) > 0, ps1b)

Prepare LEfSe input

# Patient vs. control at baseline
ps1a = prune_samples(sample_names(ps1)[grep("B$", sample_names(ps1))], ps1)
ps1a = prune_taxa(taxa_sums(ps1a) > 0, ps1a)

# Patient followup vs. patient baseline
ps1b = prune_samples(sample_names(ps1)[grep("^P", sample_names(ps1))], ps1)
ps1b = prune_taxa(taxa_sums(ps1b) > 0, ps1b)

tax1 = lefse_1name_obj(ps1a, sample_data(ps1a)$Group)
lefse1 = lefse_obj(ps1a)
lefse1 = rbind(tax1, lefse1)

tax2 = lefse_1name_obj(ps1b, sample_data(ps1b)$Time)
lefse2 = lefse_obj(ps1b)
lefse2 = rbind(tax2, lefse2)

# Replace unsupported chars
lefse1$name = gsub("-","_",lefse1$name)
lefse1$name = gsub("/","_",lefse1$name)

lefse2$name = gsub("-","_",lefse2$name)
lefse2$name = gsub("/","_",lefse2$name) 

For Silva v132 users, apply below to fix identifical phylum & class names (Actinobacteria and Deferribacteres)

lefse1 = fix_taxa_silva132(lefse1)
lefse2 = fix_taxa_silva132(lefse2)

Output to file

write.table(lefse1, file = "lefse/expr1.lefse_table.txt", sep = "\t", quote = F, 
        row.names = F, col.names = F)
write.table(lefse2, file = "lefse/expr2.lefse_table.txt", sep = "\t", quote = F, 
        row.names = F, col.names = F)

Perform LEfSe analysis

We use linear discriminant analysis (LDA) effect size (LEfSe) to determines the features (in this case the clades at each taxonomic rank) most likely to explain the differences between Parkinson’s patients and control subjects

Set the full-path to LEfSe folder

nsegata_lefse = "/path-to-script/nsegata-lefse-9adc3a62460e/"

Run LEfSe

system2(paste0(nsegata_lefse, "format_input.py"), 
    args = c("lefse/expr1.lefse_table.txt", "lefse/expr1.lefse_table.in", 
         "-c 1", "-u 2", "-o 1000000"))

system2(paste0(nsegata_lefse, "format_input.py"), 
    args = c("lefse/expr2.lefse_table.txt", "lefse/expr2.lefse_table.in", 
         "-c 1", "-u 2", "-o 1000000"))

# set KW alpha to 1 to allow returning of all P-value to perform adjustment later
system2(paste0(nsegata_lefse, "run_lefse.py"), 
    args = c("lefse/expr1.lefse_table.in", "lefse/expr1.lefse_table.res", 
         "-b 100", "-a 1", "-l 2"), stdout = TRUE)
## [1] "Number of significantly discriminative features: 33 ( 252 ) before internal wilcoxon"
## [2] "Number of discriminative features with abs LDA score > 2.0 : 32"
system2(paste0(nsegata_lefse, "run_lefse.py"), 
    args = c("lefse/expr2.lefse_table.in", "lefse/expr2.lefse_table.res", 
         "-b 100", "-a 1", "-l 2"), stdout = TRUE)
## [1] "Number of significantly discriminative features: 9 ( 251 ) before internal wilcoxon"
## [2] "Number of discriminative features with abs LDA score > 2.0 : 9"

Multiple testing correction

Perform multiple testing correction on KW P-values

# set fdr threshold at 0.1
q = 0.1

expr1 = data.frame(fread("lefse/expr1.lefse_table.res"))
expr1 = pcorrection(expr1, q)

expr2 = data.frame(fread("lefse/expr2.lefse_table.res"))
expr2 = pcorrection(expr2, q)

# You can use table() to check number of taxa pass the threshold
table(expr1$V3 != "")
## 
## FALSE  TRUE 
##   222    32
table(expr2$V3 != "")
## 
## FALSE  TRUE 
##   245     9
# Write new result file with the P-value column replaced by the FDR
write.table(expr1[,c(1:4,6)], file = "lefse/expr1.lefse_table.res.padj", sep = "\t", quote = F, 
        row.names = F, col.names = F)
write.table(expr2[,c(1:4,6)], file = "lefse/expr2.lefse_table.res.padj", sep = "\t", quote = F, 
        row.names = F, col.names = F)

Plot cladogram

Set paths

Set the full-paths to export2graphlan, GraPhlAn and lefse.pl if they are not in PATH

export2graphlan = "/path-to-script/export2graphlan.py"
graphlan_annotat = "/path-to-script/graphlan_annotate.py"
graphlan = "/path-to-script/graphlan.py"
graphlan_parser = "/path-to-script/lefse.pl"

Run export2graphlan and GraPhlAn

Note: Update the coloring lefse/expr1.colors and lefse/expr2.colors if necessary. The colors should be defined in HSV (hue, saturation, value) scale

Patient vs. control at baseline (expr1)

system2(export2graphlan, 
    args = c("-i lefse/expr1.lefse_table.txt", "-o lefse/expr1.lefse_table.res.padj", 
         "-t lefse/expr1.graphlan_tree.txt", "-a lefse/expr1.graphlan_annot.txt", 
         "--external_annotations 2,3,4,5,6", "--fname_row 0", "--skip_rows 1", 
         "--biomarkers2colors lefse/expr1.colors"))

system2(graphlan_annotat, 
    args = c("--annot lefse/expr1.graphlan_annot.txt", 
         "lefse/expr1.graphlan_tree.txt", 
         "lefse/expr1.graphlan_outtree.txt"))

system2(graphlan, 
    args = c("--dpi 150", "lefse/expr1.graphlan_outtree.txt", "lefse/expr1.graphlan.png", 
         "--external_legends", "--size 8", "--pad 0.2"))
"lefse/expr1.graphlan.png"

“lefse/expr1.graphlan.png”

Patient at baseline vs. patient at followup (expr2)

system2(export2graphlan, 
        args = c("-i lefse/expr2.lefse_table.txt", "-o lefse/expr2.lefse_table.res.padj", 
                 "-t lefse/expr2.graphlan_tree.txt", "-a lefse/expr2.graphlan_annot.txt", 
                 "--external_annotations 2,3,4,5,6", "--fname_row 0", "--skip_rows 1", 
                 "--biomarkers2colors lefse/expr2.colors"))

system2(graphlan_annotat, 
        args = c("--annot lefse/expr2.graphlan_annot.txt", 
                 "lefse/expr2.graphlan_tree.txt",
                 "lefse/expr2.graphlan_outtree.txt"))

system2(graphlan, 
        args = c("--dpi 150", "lefse/expr2.graphlan_outtree.txt", "lefse/expr2.graphlan.png", 
                 "--external_legends", "--size 8", "--pad 0.2"))
"lefse/expr2.graphlan.png"

“lefse/expr2.graphlan.png”

Convert to readble output

system2("perl", args = c(graphlan_parser, "lefse/expr1.lefse_table.res.padj", 
             "lefse/expr1.graphlan_outtree.txt", "lefse/expr1.out"))

system2("perl", args = c(graphlan_parser, "lefse/expr2.lefse_table.res.padj", 
             "lefse/expr2.graphlan_outtree.txt", "lefse/expr2.out"))

Plot results

Load parsed results

Patient vs. control at baseline (expr1)

res1 = data.frame(fread("lefse/expr1.out"))
res1 = res1[order(res1$order),]
names(res1)[c(5:7)] = c("Group","LDA","FDR")
res1$taxon = paste0(res1$taxon, "(",res1$rank, ";", " ",res1$order, ")")
res1$taxon = as.factor(res1$taxon)
res1$taxon = factor(res1$taxon, levels = unique(res1$taxon[order(res1$order, decreasing = T)]))
res1$Group = as.factor(res1$Group)
levels(res1$taxon) = gsub("Escherichia_Shigella","Escherichia/Shigella",levels(res1$taxon))
levels(res1$taxon) = gsub("_UCG_","_UCG-",levels(res1$taxon))

Patient at baseline vs. patient at followup (expr2)

res2 = data.frame(fread("lefse/expr2.out"))
res2 = res2[order(res2$order),]
names(res2)[c(5:7)] = c("Group","LDA","FDR")
res2$taxon = paste0(res2$taxon, "(",res2$rank, ";", " ",res2$order, ")")
res2$taxon = as.factor(res2$taxon)
res2$taxon = factor(res2$taxon, levels = unique(res2$taxon[order(res2$order, decreasing = T)]))
res2$Group = as.factor(res2$Group)
levels(res2$taxon) = gsub("Escherichia_Shigella","Escherichia/Shigella",levels(res2$taxon))
levels(res2$taxon) = gsub("_UCG_","_UCG-",levels(res2$taxon))

Plot bar

Patient vs. control at baseline (expr1)

plot_lefse = ggplot(res1, aes(taxon, LDA, fill = Group)) + 
    geom_bar(stat = "identity", width = 0.7, size = 0.5) + coord_flip() + 
    theme_bw() + facet_wrap(~ Group, ncol = 1, scales = "free_y") + 
    scale_fill_manual(values = c("control" = "blue", "patient" = "red")) +
    theme(legend.position = "none", 
          axis.text.x = element_text(size = 12), 
          axis.text.y = element_text(face = "bold", size = 8), 
          strip.text.x = element_text(face = "bold", size = 12)) + 
    labs(title = "LEfSe Analysis (Baseline)", x = "Taxon", y = "LDA")

groupN = res1 %>% group_by(Group) %>% summarise(count = length(unique(taxon)))
## `summarise()` ungrouping output (override with `.groups` argument)
gt = ggplotGrob(plot_lefse)
panelI.1 = gt$layout$t[grepl("panel", gt$layout$name)]
gt$heights[panelI.1] = unit(groupN$count, "null")
invisible(dev.off())

png("lefse/expr1.lefse_table.png", width = 6, height = 6, units = "in", res = 300)
grid.draw(gt)
invisible(dev.off())
"lefse/expr1.lefse_table.png"

“lefse/expr1.lefse_table.png”

Patient at baseline vs. patient at followup (expr2)

plot_lefse = ggplot(res2, aes(taxon, LDA, fill = Group)) + 
        geom_bar(stat = "identity", width = 0.7, size = 0.5) + coord_flip() + 
        theme_bw() + facet_wrap(~ Group, ncol = 1, scales = "free_y") +  
        scale_fill_manual(values = c("baseline" = "blue", "followup" = "red")) +
        theme(legend.position = "none", 
              axis.text.x = element_text(size = 12), 
              axis.text.y = element_text(face = "bold", size = 8), 
              strip.text.x = element_text(face = "bold", size = 12)) + 
        labs(title = "LEfSe Analysis (followup)", x = "Taxon", y = "LDA")

groupN = res2 %>% group_by(Group) %>% summarise(count = length(unique(taxon)))
## `summarise()` ungrouping output (override with `.groups` argument)
gt = ggplotGrob(plot_lefse)
panelI.1 = gt$layout$t[grepl("panel", gt$layout$name)]
gt$heights[panelI.1] = unit(groupN$count, "null")
invisible(dev.off())

png("lefse/expr2.lefse_table.png", width = 6, height = 3, units = "in", res = 300)
grid.draw(gt)
invisible(dev.off())
"lefse/expr2.lefse_table.png"

“lefse/expr2.lefse_table.png”

Save current workspace

save.image(file = "3_lefse_tutorial.RData")

Session information

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/p27/lib/libopenblasp-r0.3.7.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB.UTF-8       
##  [4] LC_COLLATE=C               LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_1.0.0       ggplot2_3.3.2     phyloseq_1.30.0   data.table_1.12.8
## [5] knitr_1.29       
## 
## loaded via a namespace (and not attached):
##   [1] nlme_3.1-148                bitops_1.0-6                matrixStats_0.56.0         
##   [4] bit64_0.9-7                 RColorBrewer_1.1-2          GenomeInfoDb_1.22.1        
##   [7] tools_3.6.2                 backports_1.1.8             R6_2.4.1                   
##  [10] vegan_2.5-6                 rpart_4.1-15                DBI_1.1.0                  
##  [13] Hmisc_4.4-0                 BiocGenerics_0.32.0         mgcv_1.8-31                
##  [16] colorspace_1.4-1            permute_0.9-5               ade4_1.7-15                
##  [19] nnet_7.3-14                 withr_2.2.0                 tidyselect_1.1.0           
##  [22] gridExtra_2.3               DESeq2_1.26.0               bit_1.1-15.2               
##  [25] compiler_3.6.2              Biobase_2.46.0              htmlTable_2.0.1            
##  [28] DelayedArray_0.12.3         scales_1.1.1                checkmate_2.0.0            
##  [31] genefilter_1.68.0           stringr_1.4.0               digest_0.6.25              
##  [34] Rsamtools_2.2.3             foreign_0.8-73              rmarkdown_2.3              
##  [37] dada2_1.14.1                XVector_0.26.0              base64enc_0.1-3            
##  [40] jpeg_0.1-8.1                pkgconfig_2.0.3             htmltools_0.5.0            
##  [43] highr_0.8                   htmlwidgets_1.5.1           rlang_0.4.6                
##  [46] RSQLite_2.2.0               rstudioapi_0.11             generics_0.0.2             
##  [49] hwriter_1.3.2               jsonlite_1.7.0              BiocParallel_1.20.1        
##  [52] acepack_1.4.1               RCurl_1.98-1.2              magrittr_1.5               
##  [55] GenomeInfoDbData_1.2.2      Formula_1.2-3               biomformat_1.14.0          
##  [58] Matrix_1.2-18               Rcpp_1.0.5                  munsell_0.5.0              
##  [61] S4Vectors_0.24.4            Rhdf5lib_1.8.0              ape_5.4                    
##  [64] lifecycle_0.2.0             stringi_1.4.6               yaml_2.2.1                 
##  [67] MASS_7.3-51.6               SummarizedExperiment_1.16.1 zlibbioc_1.32.0            
##  [70] rhdf5_2.30.1                plyr_1.8.6                  blob_1.2.1                 
##  [73] parallel_3.6.2              crayon_1.3.4                lattice_0.20-41            
##  [76] Biostrings_2.54.0           splines_3.6.2               annotate_1.64.0            
##  [79] multtest_2.42.0             locfit_1.5-9.4              pillar_1.4.5               
##  [82] igraph_1.2.5                GenomicRanges_1.38.0        geneplotter_1.64.0         
##  [85] reshape2_1.4.4              codetools_0.2-16            stats4_3.6.2               
##  [88] XML_3.98-1.20               glue_1.4.1                  evaluate_0.14              
##  [91] ShortRead_1.44.3            latticeExtra_0.6-29         RcppParallel_5.0.2         
##  [94] png_0.1-7                   vctrs_0.3.1                 foreach_1.5.0              
##  [97] gtable_0.3.0                purrr_0.3.4                 xfun_0.15                  
## [100] xtable_1.8-4                survival_3.2-3              tibble_3.0.2               
## [103] iterators_1.0.12            memoise_1.1.0               AnnotationDbi_1.48.0       
## [106] GenomicAlignments_1.22.1    IRanges_2.20.2              cluster_2.1.0              
## [109] ellipsis_0.3.1